########## PYTHON EXAMPLES ######### # NOTE: The code was executed using Python version 3.11.40. # Using a different Python version may require upgrading or downgrading specific libraries, depending on library dependencies. # Libraries can be installed using the command: pip install # Example: pip install pandas -> installs the pandas library, which must be done before importing a library using the import command: import pandas # Specify working directory import os # load os library path = '/Impact_Evaluation' # the path should be changed by the user os.chdir(path) # change working directory to path # 1 import pandas as pd # load pandas library import statsmodels.api as sm # load statsmodels library df = pd.read_csv('data/coffeeleaflet.csv') # load coffeeleaflet data df = df.loc[:, ['treatment', 'awarewaste']] # only keep columns of interest df = df.dropna() # drop missing observations D = df['treatment'] # define treatment D = sm.add_constant(D) # add a constant Y = df['awarewaste'] # define outcome results = sm.OLS(Y, D).fit(cov_type = 'HC0') # estimate impact print(results.summary()) # show results # 2 import pandas as pd # load pandas library import numpy as np # load numpy library import statsmodels.api as sm # load statsmodels library import matplotlib.pyplot as plt # load matplotlib library df = pd.read_csv('data/marketing.csv') # load marketing data Y = df['sales'] # define outcome D = df['newspaper'] # define treatment x_axis = np.arange(min(D), max(D)) # define points where we evaluate results = sm.nonparametric.KernelReg(Y,D,'c', reg_type='lc').fit(x_axis) #kernel plt.plot(x_axis, results[0], color='black') # plot average sales plt.xlabel('newspaper') # label x axis plt.ylabel('sales') # label y axis plt.ylim([12, 26]) # setting y-axis range plt.show() # show the plot plt.plot(x_axis, results[1], color='black') # plot impact plt.xlabel('newspaper') # label x axis plt.ylabel('marginal effects') # label y axis plt.show() # show the plot # 3 import pandas as pd # load pandas library import statsmodels.api as sm # load statsmodels library df = pd.read_csv('data/coupon.csv') # load coupon data X = df.loc[:, df.columns != 'dailyspending'] # define regressors X = sm.add_constant(X) # add a constant Y = df['dailyspending'] # define outcome results = sm.OLS(Y, X).fit(cov_type = 'HC0') # run linear regression print(results.summary()) # show results # 4 from causalinference import CausalModel # load causalmodel method import pandas as pd # load pandas library import numpy as np # load numpy library df = pd.read_csv('data/coupon.csv') # load coupon data Y = np.asarray(df['dailyspending']) # select outcome D = np.asarray(df['coupons']) # select treatment X = np.asarray(df.drop(['dailyspending', 'coupons'], axis=1)) # select covariates model = CausalModel(Y,D,X) # create causal model model.est_via_matching(weights='inv', matches=1) # pair matching print(model.estimates) # matching output # 5 import pandas as pd # load pandas library import dowhy as dw # load dowhy library df = pd.read_csv('data/coupon.csv') # load coupon data X = df.drop(['dailyspending', 'coupons'], axis = 1) # select covariates model = dw.CausalModel(data = df, # create model treatment = 'coupons', # specify treatment outcome = 'dailyspending', # specify outcome common_causes = list(X.columns)) # specify covariates identified_estimand = model.identify_effect() # identify the effect estimate = model.estimate_effect(identified_estimand, # IPW method_name = 'backdoor.propensity_score_weighting', target_units = 'ate', method_params = {'weighting_scheme': 'ips_weight'}) print(estimate.value) # show impact print(estimate.test_stat_significance()) # show p-value # 6 import pandas as pd # load pandas library import doubleml as dml # load doubleml library from sklearn import linear_model # import models from sklearn df = pd.read_csv('data/coupon.csv') # load coupon data X = df.drop(['dailyspending', 'coupons'], axis=1) # select covariates dml_data = dml.DoubleMLData(df, # create object from coupon data y_col = 'dailyspending', # define outcome d_cols = 'coupons', # define intervention x_cols = list(X.columns.values)) # define covariates ml_l = linear_model.LinearRegression() # define model for outcome ml_m = linear_model.LogisticRegression() # define model for intervention dr = dml.DoubleMLPLR(dml_data, # DR estimation ml_l, ml_m).fit() print(dr.summary) # show the results # 7 import pandas as pd # load pandas library import doubleml as dml # load doubleml library from sklearn import linear_model # load from sklearn df = pd.read_csv('data/coupon.csv') # load coupon data X = df.drop(['dailyspending', 'coupons'], axis = 1) # define covariates dml_data = dml.DoubleMLData(df, # create data y_col = 'dailyspending', # define outcome d_cols = 'coupons', # define intervention x_cols = list(X.columns.values)) # define covariates ml_l = linear_model.LassoCV() # learner for outcome ml_m = linear_model.LogisticRegressionCV(penalty = 'l1', # learner for treatment solver = 'saga', # solver max_iter = 350, # max. iterations Cs = 1, # inverse regularization tol = 0.01) # tolerance for stopping dml_lasso = dml.DoubleMLPLR(dml_data, # double machine learning ml_l, # outcome model ml_m, # treatment model n_folds = 3).fit() # number of folds print(dml_lasso.summary) # show the results # 8 import pandas as pd # load pandas library import matplotlib.pyplot as plt # load matplotlib library from econml.dml import CausalForestDML # load CausalForest from econml from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier # load base models df = pd.read_csv('data/coupon.csv') # load coupon data df.columns = df.columns.astype(str) # convert column names to strings Y = df[['dailyspending']] # define outcome as DataFrame D = df[['coupons']] # define treatment as DataFrame X = df.drop(['dailyspending', 'coupons'], axis=1) # define covariates est = CausalForestDML(criterion='het', # use criterion for heterogeneity n_estimators=500, # number of trees min_samples_leaf=10, # minimum samples per leaf max_depth=10, # maximum tree depth max_samples=0.5, # subsample ratio discrete_treatment=True, # binary treatment model_y=RandomForestRegressor(max_features='sqrt'), # base model for outcome model_t=RandomForestClassifier()) # base model for treatment est.fit(Y=Y, T=D, X=X, W=X) # fit causal forest cateX = est.effect(X) # compute CATEs lb, ub = est.effect_interval(X, alpha=0.05) # confidence intervals via bootstrap cates = pd.concat([pd.DataFrame(cateX), # combine CATEs pd.DataFrame(lb), # lower bounds pd.DataFrame(ub)], axis=1) # upper bounds cates.columns = ['CATE', 'Lower Bound', 'Upper Bound'] # rename columns print(cates) # print CATE table plt.hist(cateX, color='gray', bins=16, rwidth=0.9) # plot histogram of CATEs plt.xlabel('Predictions') # set x-axis label plt.ylabel('Frequency') # set y-axis label plt.title('Histogram of CATEs') # set plot title plt.show() # show plot # 9 import pandas as pd # load pandas library import numpy as np # load numpy library import doubleml as dml # load doubleml library from sklearn import ensemble # load ensemble from sklearn df = pd.read_csv('data/coupon.csv') # load coupon data X = df.drop(['dailyspending', 'coupons'], axis = 1) # select covariates dml_data = dml.DoubleMLData(df, # create data y_col = 'dailyspending', # define outcome d_cols = 'coupons', # define intervention x_cols = list(X.columns.values)) # define covariates ml_g = ensemble.RandomForestRegressor(max_features = 'sqrt') # outcome model ml_m = ensemble.RandomForestClassifier() # intervention model out = dml.DoubleMLIRM(dml_data, # double machine learning ml_g, # learner for outcome ml_m, # learner for intervention normalize_ipw = True, # normalize weights trimming_rule = 'truncate', # trimming approach trimming_threshold = 0.01).fit() # threshold for trimming df['intercept'] = np.ones(len(df)) # create constant cateX = df.loc[:, ['intercept', 'dailyspending_preperiod']] # X for heterogeneity CATEs = out.cate(cateX) # estimate effect heterogeneity print(CATEs) # CATEs by past spending # 10 importances = est.feature_importances_ # importances of X in causal forest importances = importances.ravel() if importances.ndim > 1 else importances feature_names = X.columns # extract names of X importance_df = pd.DataFrame({ 'Feature': feature_names, 'Importance': importances # use flattened importances }) print(importance_df) # report importances of X # 11 policy_tree = out.policy_tree(features = X, depth = 2) # policies for 4 subgroups policy_tree.plot_tree() # tree with optimal policy # 12 import statsmodels.api as sm # load statsmodels library import linearmodels.iv.model as lm # load linearmodels library import pandas as pd # load pandas library df = pd.read_csv('data/c401k.csv') # load c401k data df = sm.add_constant(data=df, prepend=False) # add constant ivreg = lm.IV2SLS(dependent = df['nettfa'], # outcome variable endog = df['p401k'], # intervention exog = df['const'], # covariates instruments = df['e401k']) # instrument LATE = ivreg.fit(cov_type = 'robust') # run instrument regression LATE.summary # results # 13 import pandas as pd # load pandas library from sklearn.linear_model import LogisticRegressionCV # load LogisticRegressionCV from econml.iv.dr import ForestDRIV # load ForestDRIV import math # load math library df = pd.read_csv('data/c401k.csv') # load c401k data X = df.iloc[:, 4:11] # select covariates Y = df.iloc[:, 1] # select outcome W = df.iloc[:, 2] # select treatment Z = df.iloc[:, 3] # select instrument ivforest = ForestDRIV(n_estimators = 2000, # IV forest min_samples_leaf = 5, # specify min samples leaf max_depth = min(round(math.sqrt(len(X.columns)))+20, len(X.columns)), model_t_xw = LogisticRegressionCV(max_iter = 350), discrete_treatment = True) # discrete intervention ivforest.fit(Y = Y, T = W, Z = Z, X = X) # fit IV forest with data ATE = ivforest.ate(X) # compute ATE INF = ivforest.ate_inference(X) # compute p-value print(ATE) # show ATE print(INF) # show p-value # 14 from rdrobust import rdrobust, rdplot # load rdrobust library import pandas as pd # load pandas library df = pd.read_csv('data/indh.csv') # load indh data Y = df.iloc[:, 0] # outcome variable X = df.iloc[:, 1] # running variable results = rdrobust(y=Y, x=X, c=30) # RDD (threshold: 30) print(results) # show impact rdplot(y=Y, x=X, c=30) # plot outcome # 15 import pandas as pd # load pandas library import numpy as np # load numpy library import doubleml as dml # load doubleml library from sklearn import linear_model # load linear_model df = pd.read_csv('data/nsw_long.csv') # load nsw_long data df = df.dropna() # drop missings df['time'] = np.where(df['year'] == 1975, 0, 1) # create time variable df['const'] = np.ones(len(df)) # create constant term dml_data = dml.DoubleMLData(df, # create data y_col = 're', # define outcome d_cols = 'treated', # define intervention x_cols = 'const', # define covariates t_col = 'time') # define time ml_g = linear_model.LinearRegression() # define outcome model ml_m = linear_model.LogisticRegression() # define intervention model did = dml.DoubleMLDIDCS(dml_data, # DiD ml_g, # outcome model ml_m).fit() # intervention model print(did) # show results # 16 x = df.loc[:, ['educ', 'nodegree', 'age', 'married']] #define covariates dml_data = dml.DoubleMLData(df, # create data y_col = 're', # define outcome d_cols = 'treated', # define intervention x_cols = list(x.columns.values), # define covariates t_col = 'time') # define time ml_g = linear_model.LinearRegression() # define outcome model ml_m = linear_model.LogisticRegression() # define intervention model did = dml.DoubleMLDIDCS(dml_data, # DiD ml_g, # outcome model ml_m).fit() # intervention model print(did) # show results # 17 import pandas as pd # load pandas library import matplotlib.pyplot as plt # load matplotlib library from synthdid.synthdid import Synthdid as sdid # load synthdid library from synthdid.get_data import california_prop99 # load california data out = sdid(california_prop99(), # synthetic did on data unit = 'State', # define unit time = 'Year', # define time treatment = 'treated', # define intervention outcome = 'PacksPerCapita') # define outcome out = out.fit().vcov(method = 'placebo') # placebo se out.summary().summary2 # show results out.plot_outcomes() # plot results plt.show() # show the plot